#your code for first query
from pathlib import Path

import pandas as pd
import polars as pl
from IPython.display import display, HTML
# --- Prepare the plotting frame -------------------------------------------
# Keep rows with a strictly positive salary and pull only the two columns
# the chart needs into pandas.
# NOTE(review): `.toPandas()` suggests `df` is a Spark DataFrame — confirm.
positive_salary = df["SALARY"] > 0
plot_columns = ["EMPLOYMENT_TYPE_NAME", "SALARY"]
pdf = df.filter(positive_salary).select(*plot_columns).toPandas()

# Cast the labels to text and strip every run of non-ASCII characters so the
# axis labels render cleanly.
employment_labels = pdf["EMPLOYMENT_TYPE_NAME"].astype(str)
pdf["EMPLOYMENT_TYPE_NAME"] = employment_labels.str.replace(
    r"[^\x00-\x7F]+", "", regex=True
)

# Median salary per employment type; shown as a quick sanity check and used
# below to decide the category order.
salary_by_type = pdf.groupby("EMPLOYMENT_TYPE_NAME")["SALARY"]
median_salaries = salary_by_type.median()
display(median_salaries.to_frame().head())

# Employment types ranked from highest to lowest median salary.
ranked_medians = median_salaries.sort_values(ascending=False)
sorted_employment_types = ranked_medians.index

# Turn the label column into an ordered categorical that follows that ranking.
pdf["EMPLOYMENT_TYPE_NAME"] = pd.Categorical(
    pdf["EMPLOYMENT_TYPE_NAME"],
    categories=sorted_employment_types,
    ordered=True,
)
# Build the box plot: one box per employment type, salary on the y-axis.
# - `points="all"` draws every observation alongside its box, not only the
#   outliers.
# - The one-element colour sequence renders every box in the same red
#   (#CC0000).
# Axis, grid and font styling is applied afterwards through
# `fig.update_layout`.
fig = px.box(
    pdf,
    x="EMPLOYMENT_TYPE_NAME",
    y="SALARY",
    title="Salary Distribution by Employment Type",
    color_discrete_sequence=["#CC0000"],
    boxmode="group",
    points="all",
)
# --- Figure styling --------------------------------------------------------
# Reusable style fragments; each use below copies them so the two axes never
# share a dict object.
axis_label_font = dict(size=14, family="Helvetica", color="black", weight="bold")
base_font = dict(size=12, family="Helvetica", color="black")
axis_frame = dict(showline=True, linewidth=2, linecolor="black", mirror=True)

# Salary ticks every 50,000 from 0 to 500,000, labelled in thousands to match
# the "Salary (in $1000)" axis title.
salary_tick_values = list(range(0, 500001, 50000))
salary_tick_labels = [str(value // 1000) for value in salary_tick_values]

# X axis: horizontal tick labels, no vertical grid, and categories pinned to
# the median-salary ranking.
x_axis_style = dict(
    title=dict(text="Employment Type", font=dict(axis_label_font)),
    tickangle=0,
    tickfont=dict(base_font),
    showgrid=False,
    categoryorder="array",
    categoryarray=sorted_employment_types.tolist(),
    **axis_frame,
)

# Y axis: fixed tick positions with thin, light-gray horizontal grid lines.
y_axis_style = dict(
    title=dict(text="Salary (in $1000)", font=dict(axis_label_font)),
    tickvals=salary_tick_values,
    ticktext=salary_tick_labels,
    tickfont=dict(base_font),
    showgrid=True,
    gridcolor="lightgray",
    gridwidth=0.5,
    **axis_frame,
)

fig.update_layout(
    # 16pt title (no weight is set, so it is not forced to bold).
    title=dict(
        text="Salary Distribution by Employment Type",
        font=dict(size=16, family="Helvetica", color="black"),
    ),
    xaxis=x_axis_style,
    yaxis=y_axis_style,
    font=dict(base_font),
    boxgap=0.7,
    plot_bgcolor="white",
    paper_bgcolor="white",
    showlegend=False,
    height=500,
    width=850,
)
# Show & export
# Create the export directory first: both writers below fail when the
# target folder does not exist (e.g. on a fresh checkout).
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

# Render the figure, then save an interactive HTML copy and a static SVG at
# the same 850x500 size used in the layout.
# NOTE(review): static image export presumably relies on an image-export
# engine (e.g. kaleido) being installed — confirm in the environment.
fig.show()
fig.write_html(str(output_dir / "Q1.html"))
fig.write_image(str(output_dir / "Q1.svg"), width=850, height=500, scale=1)